{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create the baseline results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data via torch dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer\n",
    "\n",
    "# Tokenizer instantiated only so that a valid dataset can be constructed below.\n",
    "dummy_tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\", add_special_tokens=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.data.components.helsinki import HelsinkiProminenceExtractor\n",
    "from src.data.components.datasets import TokenTaggingDataset\n",
    "from torch.utils.data import DataLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_texts: 116263, test_texts: 4822\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Root directory of the Helsinki prosody data. Defaults to the path this notebook\n",
    "# was originally run with; set the HELSINKI_DATA_DIR environment variable to point\n",
    "# it somewhere else without editing the notebook.\n",
    "HELSINKI_DATA_DIR = os.environ.get(\n",
    "    \"HELSINKI_DATA_DIR\",\n",
    "    \"/Users/lukas/Desktop/projects/MIT/prosody/prosody/repositories/helsinki-prosody/data\",\n",
    ")\n",
    "\n",
    "# One extractor per split file; both read from the same data directory.\n",
    "train_extractor = HelsinkiProminenceExtractor(HELSINKI_DATA_DIR, \"train_360.txt\")\n",
    "train_texts = train_extractor.get_all_texts()\n",
    "train_prominences = train_extractor.get_all_real_prominence()\n",
    "\n",
    "test_extractor = HelsinkiProminenceExtractor(HELSINKI_DATA_DIR, \"test.txt\")\n",
    "test_texts = test_extractor.get_all_texts()\n",
    "test_prominences = test_extractor.get_all_real_prominence()\n",
    "\n",
    "print(f\"train_texts: {len(train_texts)}, test_texts: {len(test_texts)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both splits are wrapped with identical dataset settings.\n",
    "dataset_kwargs = dict(score_first_token=True, relative_to_prev=True, n_prev=3)\n",
    "\n",
    "train_dataset = TokenTaggingDataset(\n",
    "    train_texts, train_prominences, dummy_tokenizer, \"bert-cased\", **dataset_kwargs\n",
    ")\n",
    "test_dataset = TokenTaggingDataset(\n",
    "    test_texts, test_prominences, dummy_tokenizer, \"bert-cased\", **dataset_kwargs\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every training item keep its text and the labels at positions where loss_mask == 1.\n",
    "train_sentences, train_labels = [], []\n",
    "for idx in range(len(train_dataset)):\n",
    "    sample = train_dataset[idx]\n",
    "    train_sentences.append(sample[\"input_text\"])\n",
    "    keep = np.array(sample[\"loss_mask\"]) == 1\n",
    "    train_labels.append(np.array(sample[\"tokenized_labels\"])[keep])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every test item keep its text and the labels at positions where loss_mask == 1.\n",
    "test_sentences, test_labels = [], []\n",
    "for idx in range(len(test_dataset)):\n",
    "    sample = test_dataset[idx]\n",
    "    test_sentences.append(sample[\"input_text\"])\n",
    "    keep = np.array(sample[\"loss_mask\"]) == 1\n",
    "    test_labels.append(np.array(sample[\"tokenized_labels\"])[keep])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process data\n",
    "#### Remove punctuation and lowercase everything"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['for man of you your characteristic race here may he hardy sweet gigantic grow here tower proportionate to nature here climb the vast pure spaces unconfined uncheckd by wall or roof here laugh with storm or sun here joy here patiently inure here heed himself unfold himself not others formulas heed here fill his time to duly fall to aid last to disappear to serve',\n",
       " 'tom the pipers son',\n",
       " 'tom tom the pipers son stole a pig and away he run the pig was eat and tom was beat and tom ran crying down the street',\n",
       " 'there was not a worse vagabond in shrewsbury than old barney the piper',\n",
       " 'he never did any work except to play the pipes and he played so badly that few pennies ever found their way into his pouch']"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from src.utils.text_processing import python_lowercase_remove_punctuation\n",
    "\n",
    "# Apply the same lowercase / punctuation-removal helper to both splits.\n",
    "train_sentences, test_sentences = (\n",
    "    python_lowercase_remove_punctuation(split) for split in (train_sentences, test_sentences)\n",
    ")\n",
    "\n",
    "train_sentences[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lengths of train and test set: 2075946 and 90050, respectively\n",
      "Lengths of train and test labels: 2075946 and 90050, respectively\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-sentence data into one word list and one label list per split.\n",
    "train_words = [word for sentence in train_sentences for word in sentence.split(\" \")]\n",
    "train_prominences = [prominence for sentence in train_labels for prominence in sentence]\n",
    "\n",
    "test_words = [word for sentence in test_sentences for word in sentence.split(\" \")]\n",
    "test_prominences = [prominence for sentence in test_labels for prominence in sentence]\n",
    "\n",
    "# The baselines below pair words with labels purely by position, so a length\n",
    "# mismatch would silently misalign features and targets; fail loudly instead.\n",
    "assert len(train_words) == len(train_prominences), \"train words and labels are misaligned\"\n",
    "assert len(test_words) == len(test_prominences), \"test words and labels are misaligned\"\n",
    "\n",
    "print(\n",
    "    f\"Lengths of train and test set: {len(train_words)} and {len(test_words)}, respectively\"\n",
    ")\n",
    "print(\n",
    "    f\"Lengths of train and test labels: {len(train_prominences)} and {len(test_prominences)}, respectively\"\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple Models"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random predictions "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dummy model \n",
      "Mean squared error: 3.704456719075822\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "from src.models.baselines.dummy_models import DummyModel\n",
    "\n",
    "# Baseline for the section above: a DummyModel built from the training labels\n",
    "# produces one prediction per test label.\n",
    "dummy_model = DummyModel(train_prominences, nb_sig=3)\n",
    "print(\"Dummy model \")\n",
    "\n",
    "predictions = dummy_model.predict(len(test_prominences))\n",
    "\n",
    "mse = mean_squared_error(test_prominences, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Average over all words in the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average difference in prominence: 0.1367394704229622\n",
      "Mean squared error: 0.967241405994297\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Constant baseline: the mean training label is predicted for every test word.\n",
    "avg_difference = np.mean(train_prominences)\n",
    "print(f\"Average difference in prominence: {avg_difference}\")\n",
    "\n",
    "predictions = [avg_difference] * len(test_prominences)\n",
    "\n",
    "mse = mean_squared_error(test_prominences, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Corpus statistics: predict the average prominence difference per word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.6902478109049962\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Group every training label under the word it belongs to.\n",
    "word_prominence = {}\n",
    "for token, score in zip(train_words, train_prominences):\n",
    "    word_prominence.setdefault(token, []).append(score)\n",
    "\n",
    "# Mean label per word over the training set.\n",
    "word_prominence_avg = {token: np.mean(scores) for token, scores in word_prominence.items()}\n",
    "\n",
    "# Test words never seen in training fall back to avg_difference.\n",
    "predictions = [word_prominence_avg.get(token, avg_difference) for token in test_words]\n",
    "\n",
    "mse = mean_squared_error(test_prominences, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GloVe Embedding Baseline\n",
    "#### Expects the GloVe embeddings (`glove.6B.300d.txt`) to be downloaded already"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from prosody.src.models.sklearn.sklearn_models import train_sklearn_regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "weight_dir = \"/Users/lukas/Desktop/projects/MIT/MIT_prosody/precomputed/glove\"\n",
    "\n",
    "# Parse the GloVe text file: each line holds a token followed by its\n",
    "# space-separated vector components.\n",
    "# - The encoding is pinned to UTF-8 so decoding does not depend on the platform\n",
    "#   default (GloVe vocabularies may contain non-ASCII tokens).\n",
    "# - Lines are streamed and split once rather than reading the whole file into\n",
    "#   memory and splitting every line twice.\n",
    "vocab, embeddings = [], []\n",
    "with open(os.path.join(weight_dir, \"glove.6B.300d.txt\"), \"rt\", encoding=\"utf-8\") as fi:\n",
    "    for line in fi:\n",
    "        line = line.rstrip(\"\\n\")\n",
    "        if not line:\n",
    "            continue  # tolerate blank lines\n",
    "        token, *values = line.split(\" \")\n",
    "        vocab.append(token)\n",
    "        embeddings.append([float(val) for val in values])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shapes vocab: (400000,)  embeddings: (400000, 300)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Convert the parsed token list and vector list to numpy arrays.\n",
    "vocab_npa, embs_npa = np.array(vocab), np.array(embeddings)\n",
    "\n",
    "print(f\"Shapes vocab: {vocab_npa.shape}  embeddings: {embs_npa.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']\n",
      "(400002, 300)\n"
     ]
    }
   ],
   "source": [
    "# Prepend the two special tokens exactly once. The guard keeps this cell\n",
    "# idempotent: without it, a re-run would insert the tokens a second time and\n",
    "# include the previously added rows in the <unk> mean.\n",
    "if vocab_npa[0] != \"<pad>\":\n",
    "    vocab_npa = np.insert(vocab_npa, 0, \"<pad>\")\n",
    "    vocab_npa = np.insert(vocab_npa, 1, \"<unk>\")\n",
    "\n",
    "    pad_emb_npa = np.zeros((1, embs_npa.shape[1]))  # embedding for '<pad>' token.\n",
    "    unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)  # embedding for '<unk>' token.\n",
    "\n",
    "    # insert embeddings for pad and unk tokens at top of embs_npa.\n",
    "    embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))\n",
    "\n",
    "# Token i must correspond to embedding row i for the index lookups that follow.\n",
    "assert len(vocab_npa) == embs_npa.shape[0]\n",
    "\n",
    "print(vocab_npa[:10])\n",
    "print(embs_npa.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([400002, 300])\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# Wrap the numpy embedding matrix in a torch embedding layer (weights cast to float32).\n",
    "embedding_weights = torch.from_numpy(embs_npa).float()\n",
    "my_embedding_layer = torch.nn.Embedding.from_pretrained(embedding_weights)\n",
    "\n",
    "assert my_embedding_layer.weight.shape == embs_npa.shape\n",
    "print(my_embedding_layer.weight.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "168\n"
     ]
    }
   ],
   "source": [
    "# Lookup tables between tokens and their row index in the embedding matrix.\n",
    "idx_to_word = dict(enumerate(vocab_npa))\n",
    "word_to_idx = {token: position for position, token in idx_to_word.items()}\n",
    "\n",
    "print(word_to_idx[\"house\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lengths of train and test set: 2075946 and 90050, respectively\n"
     ]
    }
   ],
   "source": [
    "def _embed_words(words):\n",
    "    \"\"\"Look up one embedding row per word, using the <unk> row for unknown words.\n",
    "\n",
    "    Indices for all words are resolved first so the embedding layer is called\n",
    "    once per split rather than once per word. A list of 1-D tensors is returned\n",
    "    so that a later `torch.stack` on the result still works.\n",
    "    \"\"\"\n",
    "    unk_idx = word_to_idx[\"<unk>\"]\n",
    "    indices = torch.tensor(\n",
    "        [word_to_idx.get(word, unk_idx) for word in words], dtype=torch.long\n",
    "    )\n",
    "    return list(my_embedding_layer(indices).unbind(0))\n",
    "\n",
    "\n",
    "# Create training and test data based on embedding of the word\n",
    "train_data = _embed_words(train_words)\n",
    "test_data = _embed_words(test_words)\n",
    "\n",
    "print(\n",
    "    f\"Lengths of train and test set: {len(train_data)} and {len(test_data)}, respectively\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[62], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m train_data \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mstack(train_data)\n\u001b[1;32m      2\u001b[0m test_data \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mstack(test_data)\n\u001b[1;32m      4\u001b[0m train_data\u001b[39m.\u001b[39mshape, train_labels\u001b[39m.\u001b[39mshape\n",
      "\u001b[0;31mTypeError\u001b[0m: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor"
     ]
    }
   ],
   "source": [
    "# Stack the per-word embedding rows into a single tensor per split.\n",
    "# torch.stack raises a TypeError when handed a Tensor instead of a sequence of\n",
    "# Tensors, so the conversion is skipped if this cell is re-run after stacking.\n",
    "if not isinstance(train_data, torch.Tensor):\n",
    "    train_data = torch.stack(train_data)\n",
    "if not isinstance(test_data, torch.Tensor):\n",
    "    test_data = torch.stack(test_data)\n",
    "\n",
    "train_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(torch.Size([2075946]), torch.Size([90050]))"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Regression targets for both splits as torch tensors.\n",
    "train_labels, test_labels = torch.tensor(train_prominences), torch.tensor(test_prominences)\n",
    "\n",
    "train_labels.shape, test_labels.shape"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GloVe: sklearn dummy regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.967241405994297\n"
     ]
    }
   ],
   "source": [
    "from sklearn.dummy import DummyRegressor\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# strategy=\"mean\": the regressor always predicts the mean of the training targets.\n",
    "dummy_model = DummyRegressor(strategy=\"mean\").fit(train_data, train_labels)\n",
    "predictions = dummy_model.predict(test_data)\n",
    "\n",
    "mse = mean_squared_error(test_labels, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GloVe: sklearn linear regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.7338721230640632\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Ordinary least squares from the word embedding to the label.\n",
    "linear_model = LinearRegression().fit(train_data, train_labels)\n",
    "predictions = linear_model.predict(test_data)\n",
    "\n",
    "mse = mean_squared_error(test_labels, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GloVe: sklearn HistGradientBoostingRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error: 0.677030123714282\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import HistGradientBoostingRegressor\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# With more than 10,000 training samples the estimator's default early stopping\n",
    "# holds out a random validation split, so the seed is fixed to make the reported\n",
    "# error reproducible across runs.\n",
    "hist_model = HistGradientBoostingRegressor(random_state=42)\n",
    "hist_model.fit(train_data, train_labels)\n",
    "\n",
    "predictions = hist_model.predict(test_data)\n",
    "\n",
    "mse = mean_squared_error(test_labels, predictions)\n",
    "print(f\"Mean squared error: {mse}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "prosody",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
